
library(dplyr)
library(readr)
library(assertr)
library(tidyr)

setwd('./NC Data')

sumvars = c("lag_mathscore", "lag_mathscore_sq", "lag_readingscore", "lag_readingscore_sq",
            "female", "race_asian", "race_black", "race_hisp", "race_other", "poverty", "lep")

stdnt = read_rds('student_all.rds') %>%
  filter(grade %in% c(4,5)) %>%
  verify(!is.na(lea) & !is.na(schlcode)) %>%
  group_by(lea) %>%
  mutate(districtid = cur_group_id()) %>%
  group_by(lea,schlcode) %>% 
  mutate(schoolid = cur_group_id()) %>%
  ungroup() %>%
  select(-c(lea,schlcode)) %>%
  verify(!is.na(sex) & !is.na(race)) %>% 
  mutate(female = sex=='F',
         race_asian = race=='ASIA',
         race_black = race=='BLCK',
         race_hisp = race=='HISP',
         race_other = race %in% c('AMIN','MULT','P'),
         across(c('female','race_asian','race_black','race_hisp','race_other'),as.numeric)) %>%
  verify((as.numeric(race=='WHTE'))==(1 - (race_asian + race_black + race_hisp + race_other))) %>%
  select(-c(sex,race)) %>%
  rename(studentid = mastid,
         schgrade = grade,
         mathscore = ma_score,
         readingscore = rd_score,
         lag_mathscore = lag_ma_score,
         lag_readingscore = lag_rd_score) %>%
  mutate(lag_mathscore_sq = lag_mathscore^2,
         lag_readingscore_sq = lag_readingscore^2) %>%
  mutate(across(c('female','race_black','poverty','lep'), 
                list(lag_mathscore = ~ .x*lag_mathscore,
                     lag_readingscore = ~ .x*lag_readingscore), .names = "{.fn}X{.col}"),
         across(c('race_black','poverty','lep'), ~ .x*female, .names = "femaleX{.col}"),
         across(c('poverty','lep'), ~ .x*race_black, .names = "race_blackX{.col}"),
         povertyXlep = lep*poverty) %>%
  fastDummies::dummy_cols(c("schgrade","year")) %>%
  rename_with(~ gsub('^schgrade_','schgrd',.x), matches('^schgrade_\\d$')) %>%
  rename_with(~ gsub('^year_\\d{2}','yr',.x), matches('^year_\\d{4}$'))

tchr = stdnt %>%
  pivot_longer(cols = c('teachid_math','teachid_reading'), names_to = 'sbj', values_to = 'teachid') %>%
  filter(!is.na(teachid)) %>%
  group_by(year, teachid) %>%
  summarize(moved = as.numeric(min(schoolid)!=max(schoolid)),
            n_math = length(studentid[sbj=='teachid_math']),
            n_reading = length(studentid[sbj=='teachid_reading']),
            n_both = length(intersect(studentid[sbj=='teachid_math'],studentid[sbj=='teachid_reading'])), 
            across(all_of(sumvars), ~ mean(.x, na.rm = TRUE), .names = "teacher_{.col}"), .groups = 'drop') %>%
  mutate(tch_valid = (moved==0 & pmax(n_math,n_reading)<=50 & n_both>=5),
         teacher_n = (n_math + n_reading)/2,
         teacher_n_sq = teacher_n^2) %>%
  left_join(read_rds('teacher_all.rds'), by = c('year','teachid')) %>%
  mutate(tch_fem = as.numeric(tch_sex=='F'),
         tch_race_black = as.numeric(tch_eth=='B')) %>%
  select(-c(moved,n_math,n_reading,n_both,tch_sex,tch_eth)) %>%
  rename(teacherid = teachid)

district_sum = stdnt %>%
  group_by(year,districtid) %>%
  summarize(across(all_of(sumvars), ~ mean(.x, na.rm = TRUE), .names = "district_{.col}"),
            district_n = n()/1000,
            district_n_sq = district_n^2, .groups = 'drop') 
school_sum = stdnt %>%
  group_by(year,districtid,schoolid) %>%
  summarize(across(all_of(sumvars), ~ mean(.x, na.rm = TRUE), .names = "school_{.col}"),
            school_n = n()/100,
            school_n_sq = school_n^2, .groups = 'drop') 

stdnt2 = stdnt %>%
  mutate(teacherid = case_when(!is.na(teachid_math) & teachid_math==teachid_reading ~ teachid_math)) %>%
  left_join(tchr, by = c('year','teacherid')) %>%
  mutate(ss = case_when(is.na(teachid_math) & is.na(teachid_reading) ~ '1) drop, missing both math and reading teacher',
                        is.na(teachid_math) | is.na(teachid_reading) ~ '2) drop, missing math OR reading teacher',
                        teachid_math!=teachid_reading ~ '3) drop, math and reading teacher not the same',
                        is.na(mathscore) & is.na(readingscore) ~ '5) drop, is missing both outcomes',
                        if_any(c(sumvars,'tch_exp'), ~ is.na(.x)) ~ '6) drop, has missing RHS variables',
                        !tch_valid ~ '7) teacher moved or has more than 50 or less than 5 students',
                        tch_valid ~ 'In Sample',
                        TRUE ~ NA))

count(stdnt2,ss) %>% mutate(pct = 100*n/sum(n))

data = stdnt2 %>%
  filter(ss=='In Sample') %>%
  select(-c(ss,teachid_math,teachid_reading,tch_valid)) %>%
  relocate(year,districtid,schoolid,teacherid,studentid,schgrade,mathscore,readingscore) %>%
  left_join(school_sum, by = c('year','districtid','schoolid')) %>%
  left_join(district_sum, by = c('year','districtid')) %>%
  ungroup() %>%
  arrange(year,districtid,schoolid,teacherid,studentid) %>%
  mutate(sid = row_number())
    
ct_na = sapply(data, function(x) sum(is.na(x)))
ct_na[ct_na!=0]

write_csv(data,'SCC_4to5.csv')

  